# Module 4 Demo — Date & Text Handling
# Hands-on practice with lubridate, stringr, and forcats for clinical programming
# Integrates R4DS Chapters 14 (Strings), 15 (Regular expressions), 16 (Factors)

# ----------------------------
# 📦 Load Required Libraries
# ----------------------------

library(dplyr)
library(tibble)
library(lubridate)
library(stringr)   # R4DS Chapter 14: Strings
library(forcats)   # R4DS Chapter 16: Factors

# ----------------------------
# 📅 Part 1: Date Conversion Practice
# ----------------------------

# Sample adverse-event records. AESTDTC_RAW deliberately mixes several date
# layouts (ISO, slash-separated, compact) so the parsing step has work to do.
ae_raw <- tibble(
  # Subjects 001-001 and 001-002 each contribute two events, 001-003 one
  USUBJID = rep(c("001-001", "001-002", "001-003"), times = c(2L, 2L, 1L)),
  AEDECOD = c(
    "HEADACHE",
    "NAUSEA",
    "FATIGUE",
    "DIZZINESS",
    "RASH"
  ),
  AESTDTC_RAW = c(
    "2024-01-20",
    "25/01/2024",
    "01/18/2024",
    "2024-01-22",
    "20240125"
  ),
  # Reference start dates, all in January 2024 (ISO layout)
  RFSTDTC = paste0("2024-01-", c(15, 15, 16, 16, 15)),
  AETM = c(
    "08:30",
    "14:15",
    "09:45",
    "16:20",
    "11:30"
  )
)

# Show the untouched input before any parsing
print("Raw adverse events data:")
print(ae_raw)

# Practice different date parsing functions
#
# AESTDTC_RAW mixes four layouts: YYYY-MM-DD, DD/MM/YYYY, MM/DD/YYYY and
# YYYYMMDD. The two slash layouts share one shape (NN/NN/NNNN), so a regex
# cannot tell them apart; instead each slash value is parsed day-first and,
# only if that fails (e.g. "01/18/2024": there is no month 18), month-first.
# Caveat: a value that is valid both ways (e.g. "01/02/2024") is read as
# DD/MM/YYYY because the day-first parse is tried first.
ae_dates <- ae_raw %>%
  mutate(
    # Scratch columns, dropped again at the end of the pipeline.
    # quiet = TRUE: every parser sees the whole mixed-format column, so
    # "failed to parse" warnings for the other layouts are expected noise.
    TMP_IS_SLASH = str_detect(AESTDTC_RAW, "^\\d{2}/\\d{2}/\\d{4}$"),
    TMP_DMY = dmy(AESTDTC_RAW, quiet = TRUE),
    TMP_MDY = mdy(AESTDTC_RAW, quiet = TRUE),

    # Pick the parser that matches each value's layout
    AESTDT_AUTO = case_when(
      str_detect(AESTDTC_RAW, "^\\d{4}-\\d{2}-\\d{2}$") ~
        ymd(AESTDTC_RAW, quiet = TRUE),
      TMP_IS_SLASH & !is.na(TMP_DMY) ~ TMP_DMY,   # day-first succeeded
      TMP_IS_SLASH ~ TMP_MDY,                     # fall back to month-first
      str_detect(AESTDTC_RAW, "^\\d{8}$") ~ ymd(AESTDTC_RAW, quiet = TRUE),
      TRUE ~ as.Date(NA)
    ),

    # Convert reference start date
    RFSTDT = ymd(RFSTDTC),

    # Label the layout that was actually used for each value. A slash value
    # that parses neither way falls through to "Unknown".
    DATE_FORMAT = case_when(
      str_detect(AESTDTC_RAW, "^\\d{4}-\\d{2}-\\d{2}$") ~ "ISO (YYYY-MM-DD)",
      TMP_IS_SLASH & !is.na(TMP_DMY) ~ "European (DD/MM/YYYY)",
      TMP_IS_SLASH & !is.na(TMP_MDY) ~ "US (MM/DD/YYYY)",
      str_detect(AESTDTC_RAW, "^\\d{8}$") ~ "Compact (YYYYMMDD)",
      TRUE ~ "Unknown"
    )
  ) %>%
  select(-c(TMP_IS_SLASH, TMP_DMY, TMP_MDY))

# cat() rather than print(): print() would show the "\n" escape literally
cat("\nDates parsed:\n")
print(ae_dates)

# ----------------------------
# 📊 Part 2: Study Day Calculations
# ----------------------------

# Calculate AESTDY (study day) relative to the reference start date RFSTDT
ae_with_studyday <- ae_dates %>%
  mutate(
    # Basic study day calculation: always adds 1, so an event on the day
    # before RFSTDT comes out as day 0. Kept for comparison with AESTDY_SAFE.
    AESTDY = as.numeric(AESTDT_AUTO - RFSTDT) + 1,

    # Handle edge cases: no day 0. Dates before RFSTDT keep the plain
    # (negative) difference; dates on or after RFSTDT get difference + 1,
    # so RFSTDT itself is day 1.
    AESTDY_SAFE = case_when(
      is.na(AESTDT_AUTO) | is.na(RFSTDT) ~ NA_real_,
      AESTDT_AUTO < RFSTDT ~ as.numeric(AESTDT_AUTO - RFSTDT),  # Negative days (pre-treatment)
      TRUE ~ as.numeric(AESTDT_AUTO - RFSTDT) + 1               # Day 1 onwards
    ),

    # Create study day categories. Conditions are checked top to bottom, so
    # each upper bound only applies to rows not captured by an earlier one.
    STUDYDAY_PERIOD = case_when(
      is.na(AESTDY_SAFE) ~ "Missing",
      AESTDY_SAFE <= 0 ~ "Pre-treatment",
      AESTDY_SAFE <= 7 ~ "Week 1",
      AESTDY_SAFE <= 14 ~ "Week 2",
      AESTDY_SAFE <= 28 ~ "Month 1",
      TRUE ~ "After Month 1"
    )
  )

# cat() rather than print(): print() would show the "\n" escape literally
cat("\nWith study day calculations:\n")
print(ae_with_studyday)

# ----------------------------
# 📝 Part 3: String Manipulation Practice (R4DS Chapter 14)
# ----------------------------

cat("\n=== R4DS Chapter 14: Strings Practice ===\n")

# Create messy adverse event terms (realistic clinical data scenario).
# AEDECOD_RAW carries stray whitespace, mixed case and severity qualifiers
# embedded in different positions; DOSE_INFO mixes spacing, case and
# frequency wording. The cleaning step below normalizes both.
ae_messy <- tibble(
  USUBJID = c("001-001", "001-002", "001-003", "001-004", "001-005"),
  AEDECOD_RAW = c(
    "  mild HEADACHE  ",
    "NAUSEA (moderate)",
    "severe FATIGUE",
    "Dizziness - mild",
    "  RASH  moderate  "
  ),
  DOSE_INFO = c("10mg once daily", "5 MG twice daily", "20mg QD", "15 mg BID", "25mg daily"),
  TREATMENT_ARM = c("Active", "Placebo", "Active", "Active", "Placebo"),
  SEVERITY = c("Mild", "Moderate", "Severe", "Mild", "Moderate")
)

# cat() rather than print(): print() would show the "\n" escape literally
cat("\nMessy AE terms:\n")
print(ae_messy)

# Clean and standardize text: derive a normalized AE term, the severity
# embedded in the raw term, simple keyword flags, and a parsed dose.
ae_cleaned <- ae_messy %>%
  mutate(
    # Basic cleaning
    AEDECOD_CLEAN = AEDECOD_RAW %>%
      str_trim() %>%                                    # Remove leading/trailing spaces
      str_to_upper() %>%                               # Convert to uppercase
      str_replace_all("\\s+", " ") %>%                 # Replace multiple spaces with single
      str_replace_all("\\([^)]*\\)", "") %>%           # Remove parentheses and contents
      str_replace_all(" - ", " ") %>%                  # Remove dashes
      str_trim(),                                      # Trim again (removals can leave edge spaces)

    # Extract severity from the raw term (case-insensitive; the first
    # matching keyword in this order wins)
    SEVERITY_EXTRACTED = case_when(
      str_detect(AEDECOD_RAW, "(?i)mild") ~ "MILD",
      str_detect(AEDECOD_RAW, "(?i)moderate") ~ "MODERATE",
      str_detect(AEDECOD_RAW, "(?i)severe") ~ "SEVERE",
      TRUE ~ "UNKNOWN"
    ),

    # Extract base term (remove severity qualifiers)
    AETERM_BASE = AEDECOD_CLEAN %>%
      str_replace_all("^(MILD|MODERATE|SEVERE)\\s+", "") %>%  # Remove severity at start
      str_replace_all("\\s+(MILD|MODERATE|SEVERE)$", ""),     # Remove severity at end

    # Create flags based on text patterns
    HEADACHE_FLAG = ifelse(str_detect(AETERM_BASE, "HEADACHE"), "Y", "N"),
    GI_FLAG = ifelse(str_detect(AETERM_BASE, "NAUSEA|VOMITING|DIARRHEA"), "Y", "N"),

    # Clean dose information: uppercase, then collapse the spelled-out
    # frequencies to their abbreviations
    DOSE_CLEAN = DOSE_INFO %>%
      str_to_upper() %>%
      str_replace_all("ONCE DAILY|QD", "QD") %>%
      str_replace_all("TWICE DAILY|BID", "BID") %>%
      str_replace_all("\\s+", " ") %>%
      str_trim(),

    # Extract numeric dose: first number in the text, including an optional
    # decimal part so that e.g. "2.5mg" is read as 2.5 rather than 2
    DOSE_NUMERIC = as.numeric(str_extract(DOSE_INFO, "\\d+(?:\\.\\d+)?")),

    # Extract dose frequency. Works on DOSE_CLEAN, where "TWICE DAILY" has
    # already become "BID", so the DAILY test cannot misfire on it.
    DOSE_FREQ = case_when(
      str_detect(DOSE_CLEAN, "QD|DAILY") ~ "QD",
      str_detect(DOSE_CLEAN, "BID|TWICE") ~ "BID",
      TRUE ~ "OTHER"
    )
  )

# cat() rather than print(): print() would show the "\n" escape literally
cat("\nCleaned AE data:\n")
print(ae_cleaned)

# ----------------------------
# 🔍 Part 4: Regular Expressions Practice (R4DS Chapter 15)
# ----------------------------

cat("\n=== R4DS Chapter 15: Regular Expressions Practice ===\n")

# Create clinical data with patterns to match
clinical_patterns <- tibble(
  USUBJID = paste0("00", 1:8, "-", sprintf("%03d", 1:8)),
  VISIT = c("Screening", "Day 1", "Week 2", "Week 4", "Month 3", "End of Study", "Follow-up", "Unscheduled"),
  LAB_RESULT = c("WBC: 7.2 K/uL", "Hgb: 12.5 g/dL", "PLT: 250 K/uL", "GLU: 95 mg/dL",
                 "CREAT: 1.1 mg/dL", "ALT: 45 U/L", "Total Chol: 180 mg/dL", "HbA1c: 6.2%"),
  CONMED = c("Aspirin 81mg daily", "Metformin 500mg BID", "Lisinopril 10mg QD",
             "Atorvastatin 20mg HS", "Insulin 10 units", "Vitamin D3 1000 IU",
             "Omega-3 1g daily", "No concomitant medications")
)

# Regular expression patterns for clinical data validation.
# LAB_RESULT has the shape "<test>: <number><optional space><unit>".
regex_patterns <- clinical_patterns %>%
  mutate(
    # Extract the numeric value. Anchored to the colon so that digits inside
    # the test name (the "1" in "HbA1c") are not mistaken for the result.
    # str_match() returns a matrix; column 2 is the first capture group.
    LAB_VALUE = as.numeric(
      str_match(LAB_RESULT, ":\\s*(\\d+\\.?\\d*)")[, 2]
    ),

    # Extract lab test name (everything before the colon)
    LAB_TEST = str_extract(LAB_RESULT, "^[^:]+"),

    # Extract units (everything after the number). The space between number
    # and unit is optional so that "6.2%" yields "%".
    LAB_UNIT = str_match(
      LAB_RESULT, ":\\s*\\d+\\.?\\d*\\s*([A-Za-z/%].*)$"
    )[, 2],

    # Validate subject ID format (should be ###-###)
    VALID_SUBJID = str_detect(USUBJID, "^\\d{3}-\\d{3}$"),

    # Extract visit number/timepoint: "Day N" -> "N", "Week N" -> "WN",
    # "Month N" -> "MN"; anything else is uppercased with spaces replaced
    # by underscores
    VISIT_TIME = case_when(
      str_detect(VISIT, "Day\\s+(\\d+)") ~ str_extract(VISIT, "\\d+"),
      str_detect(VISIT, "Week\\s+(\\d+)") ~ paste0("W", str_extract(VISIT, "\\d+")),
      str_detect(VISIT, "Month\\s+(\\d+)") ~ paste0("M", str_extract(VISIT, "\\d+")),
      TRUE ~ str_to_upper(str_replace_all(VISIT, "\\s+", "_"))
    ),

    # Extract medication dose (number + unit): first run of digits that is
    # followed, after at most one space, by letters
    MED_DOSE = str_extract(CONMED, "\\d+\\s?[a-zA-Z]+"),

    # Extract medication frequency (case-insensitive; first match wins)
    # NOTE(review): mapping "units" to "PRN" looks like a placeholder —
    # confirm the intended rule.
    MED_FREQ = case_when(
      str_detect(CONMED, "(?i)daily|QD") ~ "QD",
      str_detect(CONMED, "(?i)BID|twice") ~ "BID",
      str_detect(CONMED, "(?i)TID|three times") ~ "TID",
      str_detect(CONMED, "(?i)QID|four times") ~ "QID",
      str_detect(CONMED, "(?i)HS|bedtime") ~ "HS",
      str_detect(CONMED, "(?i)units") ~ "PRN",
      TRUE ~ "Other"
    ),

    # Flag high-risk medications by keyword (case-insensitive; "statin"
    # matches as a substring, e.g. in "Atorvastatin")
    HIGH_RISK_MED = case_when(
      str_detect(CONMED, "(?i)insulin|warfarin|digoxin") ~ "High Risk",
      str_detect(CONMED, "(?i)aspirin|metformin|statin") ~ "Moderate Risk",
      str_detect(CONMED, "(?i)vitamin|omega") ~ "Low Risk",
      str_detect(CONMED, "(?i)no.*medication") ~ "None",
      TRUE ~ "Review Required"
    )
  )

# cat() rather than print(): print() would show the "\n" escape literally
cat("\nRegular expression pattern matching:\n")
print(regex_patterns)

# Further regex checks on free-text contact details
cat("\nAdvanced regex validation:\n")

# Phone numbers: a 3-digit area code (optionally parenthesized), then 3 and
# 4 digits, each group optionally separated by "-", "." or whitespace
phone_numbers <- c(
  "(555) 123-4567",
  "555-123-4567",
  "555.123.4567",
  "5551234567",
  "invalid"
)
valid_phones <- str_detect(
  string = phone_numbers,
  pattern = "^(\\(\\d{3}\\)|\\d{3})[-.\\s]?\\d{3}[-.\\s]?\\d{4}$"
)
cat("Valid phone numbers:", phone_numbers[valid_phones], "\n", sep = " ")

# Site e-mail addresses: local part, "@", domain, and a final dot-suffix of
# at least two letters
emails <- c(
  "site@clinic.com",
  "invalid.email",
  "test@example.org",
  "bad@",
  "@missing.com"
)
valid_emails <- str_detect(
  string = emails,
  pattern = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
)
cat("Valid emails:", emails[valid_emails], "\n", sep = " ")

# ----------------------------
# 📊 Part 5: Factor Management Practice (R4DS Chapter 16)
# ----------------------------

cat("\n=== R4DS Chapter 16: Factors Practice ===\n")

# Create clinical factors for analysis
ae_factors <- ae_cleaned %>%
  mutate(
    # Convert severity to ordered factor (MILD < MODERATE < SEVERE).
    # Values outside these levels, including "UNKNOWN", become NA.
    SEVERITY_FACTOR = factor(SEVERITY_EXTRACTED,
                           levels = c("MILD", "MODERATE", "SEVERE"),
                           ordered = TRUE),

    # Convert treatment arm to factor with meaningful labels; Placebo is
    # listed first and is therefore the first (reference) level
    TREATMENT_FACTOR = factor(TREATMENT_ARM,
                            levels = c("Placebo", "Active"),
                            labels = c("Placebo", "Active Treatment")),

    # Create AE category factor from keywords in the cleaned term;
    # factor() with no explicit levels orders them alphabetically
    AE_CATEGORY = case_when(
      str_detect(AETERM_BASE, "HEADACHE|DIZZINESS") ~ "Neurological",
      str_detect(AETERM_BASE, "NAUSEA|FATIGUE") ~ "General",
      str_detect(AETERM_BASE, "RASH") ~ "Dermatological",
      TRUE ~ "Other"
    ) %>% factor(),

    # Create dose group factor; a missing DOSE_NUMERIC matches none of the
    # comparisons and falls through to "Unknown"
    DOSE_GROUP = case_when(
      DOSE_NUMERIC <= 10 ~ "Low",
      DOSE_NUMERIC <= 20 ~ "Medium",
      DOSE_NUMERIC > 20 ~ "High",
      TRUE ~ "Unknown"
    ) %>% factor(levels = c("Low", "Medium", "High", "Unknown"))
  )

# cat() rather than print(): print() would show the "\n" escape literally
cat("\nFactor analysis:\n")
print(ae_factors)

# Tabulate how many records fall into each factor level
cat("\nFactor level counts:\n")
print(table(ae_factors[["SEVERITY_FACTOR"]]))
print(table(ae_factors[["AE_CATEGORY"]]))

# Derive two re-levelled copies: categories sorted by how often they occur,
# and treatment with "Active Treatment" moved to the first level
ae_factors <- mutate(
  ae_factors,
  AE_CATEGORY_ORDERED = fct_infreq(AE_CATEGORY),
  TREATMENT_RELEVEL = fct_relevel(TREATMENT_FACTOR, "Active Treatment")
)

# Show the resulting level order of both new factors
cat("\nFactor reordering:\n")
print(levels(ae_factors[["AE_CATEGORY_ORDERED"]]))
print(levels(ae_factors[["TREATMENT_RELEVEL"]]))

# ----------------------------
# 📊 Part 6: Combining Date and Text Operations
# ----------------------------

# Comprehensive example combining dates, strings, regex, and factors
#
# NOTE(review): the join key is USUBJID only. ae_with_studyday repeats
# subjects (one row per event) while ae_factors has one row per subject, so
# every event of a subject receives that subject's single ae_factors record
# (e.g. the NAUSEA event of 001-001 is described with the HEADACHE term).
# Fine for a demo; real AE data would need an event-level key.
final_ae <- ae_with_studyday %>%
  left_join(ae_factors, by = "USUBJID") %>%
  mutate(
    # Combine clean AE term with study day info
    AE_SUMMARY = paste0(AETERM_BASE, " (Day ", AESTDY_SAFE, ")"),

    # Create comprehensive AE description with factor labels
    AE_DESCRIPTION = paste0(
      AETERM_BASE, " - ",
      as.character(SEVERITY_FACTOR), " severity, ",
      "occurred on study day ", AESTDY_SAFE,
      " (", STUDYDAY_PERIOD, "), ",
      "Category: ", AE_CATEGORY,
      ", Treatment: ", TREATMENT_FACTOR
    ),

    # Advanced pattern-based risk assessment (first matching rule wins;
    # rows matching no rule, including those with missing inputs, are "Medium")
    RISK_SCORE = case_when(
      SEVERITY_FACTOR == "SEVERE" & AESTDY_SAFE <= 7 ~ "High",
      SEVERITY_FACTOR == "MODERATE" & str_detect(AETERM_BASE, "CARDIAC|NEURO") ~ "High",
      SEVERITY_FACTOR == "MILD" ~ "Low",
      TRUE ~ "Medium"
    ) %>% factor(levels = c("Low", "Medium", "High"), ordered = TRUE),

    # Validate data quality using regex patterns; only the first failing
    # check is reported per row
    DATA_QUALITY = case_when(
      is.na(AESTDY_SAFE) ~ "Missing study day",
      is.na(AETERM_BASE) | AETERM_BASE == "" ~ "Missing AE term",
      !str_detect(USUBJID, "^\\d{3}-\\d{3}$") ~ "Invalid subject ID",
      SEVERITY_EXTRACTED == "UNKNOWN" ~ "Missing severity",
      TRUE ~ "Complete"
    )
  ) %>%
  # AE_SUMMARY is kept in the output; it was previously computed and then
  # silently dropped by this select()
  select(USUBJID, AEDECOD_RAW, AETERM_BASE, SEVERITY_FACTOR, AE_CATEGORY,
         TREATMENT_FACTOR, AESTDY_SAFE, STUDYDAY_PERIOD, RISK_SCORE,
         AE_SUMMARY, AE_DESCRIPTION, DATA_QUALITY)

# cat() rather than print(): print() would show the "\n" escape literally
cat("\nFinal comprehensive AE dataset:\n")
print(final_ae)

# ----------------------------
# 🤖 Part 7: GitHub Copilot in RStudio Practice
# ----------------------------

# Section banner and instructions, emitted in a single call
cat(
  "\n=== GitHub Copilot in RStudio Practice ===\n",
  "Try writing these comments and see what Copilot suggests in RStudio:\n\n",
  sep = ""
)

# Calculate days between AE start and study end date


# Extract the first word from adverse event term using regex


# Create a factor for severity with custom ordering


# Flag AEs that occurred within 30 days of treatment start
# (study day 1 through 30 -> "Y", anything else -> "N")
final_ae <- mutate(
  final_ae,
  EARLY_AE = ifelse(AESTDY_SAFE > 0 & AESTDY_SAFE <= 30, "Y", "N")
)

# Validate email addresses in investigator contact information


# Create ordered factors for dose escalation study


# ----------------------------
# Module 4 Demo Complete!
# ----------------------------

# Closing recap of everything the demo covered, written out in one call
cat(
  "\n🎉 Module 4 Demo Complete!\n",
  "You've practiced R4DS concepts:\n",
  "- Date parsing with lubridate (ymd, dmy, mdy)\n",
  "- Study day calculations (AESTDY)\n",
  "- String manipulation with stringr (R4DS Ch. 14)\n",
  "- Regular expressions for pattern matching (R4DS Ch. 15)\n",
  "- Factor management with forcats (R4DS Ch. 16)\n",
  "- Clinical data validation using regex patterns\n",
  "- Ordered factors for severity and risk assessment\n",
  "- Combining dates, strings, regex, and factors\n",
  "- GitHub Copilot in RStudio assistance\n",
  "\nReady for Module 5: Functions & Macro Translation!\n",
  sep = ""
)
